# Alternative data sources used in earlier experiments (kept for reference):
# df = pd.read_excel(r'../ML/ML计算/特征过滤法.xlsx',sheet_name='0')
# df=pd.read_csv(r'small_loan.csv')
# df.drop('id', inplace=True, axis=1)
# Load the house-price training data and drop its 'Id' column in place.
df = pd.read_csv(r'../01ML_Case/house_price/train.csv')
df.drop('Id', inplace=True, axis=1)
# Bare expressions like the next two only show a value in an interactive
# (notebook/REPL) session; in a plain script they have no visible effect.
df.shape
df.head()
# Build a data-quality report for the frame and keep it in `dqrs`.
# NOTE(review): target='Y' is passed here, but 'SalePrice' is only renamed to
# 'Y' further down the file - confirm DQReport copes with a target column
# that does not exist yet.
dqrs=DQReport(data=df, target='Y').SReport()
dqrs
# pde.dftohtml(dqrs)
report_view(dqrs)
# Column-name groups returned by dfc_type (presumably: string, numeric,
# integer and float columns - confirm against the helper).
dfc_str, dfc_num, dfc_num_int, dfc_num_float = dfc_type(df, if_print=True)

# Task type switch: 'fenlei' = classification, 'huigui' = regression.
# ytype = 'fenlei'
ytype = 'huigui'

# The rest of the script expects the target column to be called 'Y'.
df.rename(columns={'SalePrice': 'Y'}, inplace=True)
# df.rename(columns={'response':'Y'},inplace=True)
# df['Y'] = df['Y'].map({"YES": 1, "NO": 0})

# Integer target for classification, float target for regression.
df['Y'] = df['Y'].astype(int if ytype == 'fenlei' else float)
# Look at the distribution of the target.
if ytype != 'fenlei':
    # Regression: report skewness/kurtosis, then plot the target.
    print("Skewness: %f" % df['Y'].skew())
    print("Kurtosis: %f" % df['Y'].kurt())
    dfc_num_plt_huigui('Y', df)
else:
    # Classification: plot the class counts.
    dfc_str_col_plt_count(df, 'Y')

dfc_num_col_plt_value(df)

# Ycut: the target binned into quartiles and stored as strings, so that it
# can serve as a categorical label column later on.
df['Ycut'] = pd.qcut(df['Y'], 4).astype(str)
print(df['Ycut'].value_counts())
# Run the frame through the wrong_value_fillna transformer, configured with
# the values listed by wvf_txt().
# NOTE(review): presumably this turns placeholder "wrong" values into NaN -
# confirm against the helper.
df = wrong_value_fillna(wrong_value=list(wvf_txt())).fit_transform(df)
# df['age']=df['age'].astype(float)
# df['income']=df['income'].astype(float)
# df['children']=df['children'].astype(str)
# df_fill_na['age']=df_fill_na['age'].astype(float)
# df_fill_na['income']=df_fill_na['income'].astype(float)
# Float columns that may actually hold integers
dfc_type_mcol_float_may_int(dfc_num_float,df)
# Refresh the column-type lists after the clean-up above.
dfc_str,dfc_num,dfc_num_int,dfc_num_float=dfc_type(df,if_print=True)
# df['children']=df['children'].astype(str)
# df_fill_na=xgb_fill().fit_transform(df.drop(columns=['Y']),df.Y) # fails when there is no Y column
# df['age'].fillna(df['age'].mode()[0],inplace=True)
# df['age']=df['age'].astype(int)
# Inspect the null summary at threshold 0.05 (result is only displayed
# interactively), then drop every column that dfc_null reports at 0.2.
# NOTE(review): the threshold looks like a missing-value ratio - confirm.
dfc_null(df,0.05)
df.drop(columns=dfc_null(df,0.2).index,inplace=True)
df.shape
# Remaining per-column null counts, restricted to columns that still have nulls.
df.isnull().sum()[df.isnull().sum()>0]
# Rebuild the data-quality report on the cleaned frame.
dqrs=DQReport(data=df, target='Y').SReport()
dqrs
# pde.dftohtml(dqrs)
report_view(dqrs)
dfc_str,dfc_num,dfc_num_int,dfc_num_float=dfc_type(df,if_print=True)
abnormal_value(df,dfc_num,dqrs)# optional argument: print_value_list:bool=False
# df = fix_outlier(dfc_num,how='quartile').fit_transform(df) # use dfc_num with care: not every numeric column always needs treatment
# df = fix_outlier([]).fit_transform(df) # custom column list
# df = fix_outlier().fit_transform(df)
# abnormal_value(df,dfc_num,dqrs)
# Cursor shared with dfc_cycle(); it starts at -1 because dfc_cycle()
# increments it before using it as an index.
dfc_cycle_index=-1
# Columns to step through: string columns first, then numeric ones.
# dfc_cycle_list=dfc_num+dfc_str
dfc_cycle_list=dfc_str+dfc_num
def dfc_cycle (dfc_cycle_list:list,df:pd.DataFrame,dqrs:pd.DataFrame):
    """Advance the global cursor by one and show the plot for that column.

    Each call increments the module-level ``dfc_cycle_index`` and inspects
    ``dfc_cycle_list[dfc_cycle_index]``:

    * object-dtype columns are plotted with ``dfc_str_plt_fenlei_huigui``,
      labelled by ``'Y'`` for classification (``ytype == 'fenlei'``) and by
      ``'Ycut'`` otherwise, and the helper's return value is printed;
    * any other column first prints ``abnormal_value`` for that single
      column and is then plotted with ``dfc_num_plt_fenlei`` or
      ``dfc_num_plt_huigui`` depending on ``ytype``.

    Once the cursor runs past the end of the list a completion message is
    printed and the cursor is rewound so the next call starts over.

    Args:
        dfc_cycle_list: Column names to step through.
        df: Frame holding those columns and the label column.
        dqrs: Data-quality report forwarded to ``abnormal_value``.
    """
    global dfc_cycle_index
    dfc_cycle_index += 1

    if dfc_cycle_index >= len(dfc_cycle_list):
        print('dfc_cycle complete. dfc_cycle_index:', dfc_cycle_index)
        # Rewind to the pre-start value. The cursor is incremented *before*
        # use, so resetting to 0 would skip the first column on the next pass.
        dfc_cycle_index = -1
        return

    column = dfc_cycle_list[dfc_cycle_index]
    is_classification = ytype == 'fenlei'

    if df.dtypes[column] == 'object':
        # Same plot for both task types; only the label column differs.
        label = 'Y' if is_classification else 'Ycut'
        print(dfc_str_plt_fenlei_huigui(column, df, label_col_name=label))
        return

    print(abnormal_value(df, [column], dqrs))
    if is_classification:
        dfc_num_plt_fenlei(column, df)
    else:
        dfc_num_plt_huigui(column, df)
# dfc_cycle_list
# Show the next column of dfc_cycle_list; re-run this call to step further.
dfc_cycle(dfc_cycle_list, df, dqrs)
# for x in df.x3.unique():
# s1=df[df.x3==x]['Y']
# print(x)
# print(s1.mean(),s1.std())

# Statistical feature screening and the correlation plot, per task type.
if ytype == 'fenlei':
    fea_imp_list_kf_str_fenlei = fea_imp_kf_str_fenlei(df)
    # Numeric features vs. a categorical target.
    fea_imp_list_num_ttest_anova_fenlei = fea_imp_num_ttest_anova_fenlei(
        df, dfc_num, 'Y', fig_size=(8, 8)
    )
    print(fea_imp_list_num_ttest_anova_fenlei)
    dfcorr = dfc_plt_corr(df, fig_size=(10, 10), label_col_name=dfc_num[0])
else:
    # Regression: the binned target 'Ycut' stands in as the class label.
    fea_imp_list_kf_str_fenlei = fea_imp_kf_str_fenlei(df, 'Ycut')
    # Categorical features vs. a continuous target.
    fea_imp_list_str_ttest_anova_huigui = fea_imp_str_ttest_anova_huigui(
        df, dfc_str, fig_size=(8, 8)
    )
    print(fea_imp_list_str_ttest_anova_huigui)
    dfcorr = dfc_plt_corr(df, fig_size=(16, 16))

dfcorr_largest(dfcorr, 0.7)
# s=dfcorr[dfcorr>0.5].Y.dropna()
# s.drop(index='Y')
# s1=dfcorr[dfcorr>0.5].MSSubClass.dropna()
# s1=pd.DataFrame(s1.drop(index='MSSubClass'))
# s1
def spearman(frame, features,label_col_name='Y'):
    """Draw a bar chart of each feature's Spearman correlation with the label.

    For every name in ``features`` the Spearman coefficient between that
    column and ``frame[label_col_name]`` is computed via ``Series.corr``
    (which excludes missing values). The coefficients are sorted in
    ascending order and drawn as horizontal bars. Nothing is returned.

    Args:
        frame: DataFrame holding the feature columns and the label column.
        features: Names of the columns to correlate with the label.
        label_col_name: Name of the label column.
    """
    coefficients = [
        frame[name].corr(frame[label_col_name], method='spearman')
        for name in features
    ]
    ranking = pd.DataFrame({'feature': features, 'spearman': coefficients})
    ranking = ranking.sort_values('spearman')
    # Figure height scales with the number of features (0.25 per bar).
    plt.figure(figsize=(6, 0.25 * len(features)))
    sns.barplot(data=ranking, y='feature', x='spearman', orient='h')
# Spearman correlation of every numeric column with the default label 'Y'.
spearman(df,dfc_num)
dfc_num_plt_huigui_gmap_pairplot(df,dfc_num)
# NOTE(review): the report built on the next line is not stored, so
# report_view() below shows the `dqrs` computed earlier in the file -
# confirm that is intended.
DQReport(data=df, target='Y').SReport()
report_view(dqrs)
# Refresh the column-type lists.
dfc_str,dfc_num,dfc_num_int,dfc_num_float=dfc_type(df,if_print=True)
# Work on a copy so the exploratory frame `df` is left untouched.
dftm = df.copy()
# 'Ycut' only exists if the binned target was created earlier; drop it when
# present instead of hiding every possible error behind a bare `except`.
dftm = dftm.drop(columns=['Ycut'], errors='ignore')
dftm.head()
# Fill missing values with xgb_fill; this fails when there is no Y column.
dftm = xgb_fill().fit_transform(dftm, dftm.Y)
# dfc_null(dftm,0.001)
# dftm.isnull().sum().max()
# Total number of nulls left after filling (shown only interactively).
dftm.isnull().sum().sum()
# dfc_type(dftm,True,False)[0]
# dftm['Y']=dftm['Y'].astype(int)
# dftm['Y'].value_counts()
# df['Y'] = np.where(df['Y'] == 'YES', 1, 0) # convert Y first if it is not already encoded as 0/1/2
# 'Embarked' is not a column of every dataset this script is run on, so only
# fill it when it exists. Assign the result back: a chained
# `fillna(..., inplace=True)` on a column selection is not guaranteed to
# update the frame under pandas copy-on-write.
if 'Embarked' in dftm.columns:
    dftm['Embarked'] = dftm['Embarked'].fillna(dftm['Embarked'].mode()[0])
# The first call only displays the report in an interactive session; the
# second keeps it for report_view().
DQReport(data=dftm, target='Y').SReport()
dqrs = DQReport(data=dftm, target='Y').SReport()
report_view(dqrs)
# NOTE(review): the column types are recomputed from `df`, not from the
# freshly filled `dftm` - confirm that is intended.
dfc_str, dfc_num, dfc_num_int, dfc_num_float = dfc_type(df, if_print=True)
# Hand-picked feature subset (plus the target) used for the "filtered
# features" models.
dftm_fea = dftm.loc[:, ['Y'] + [
    'BsmtQual', 'WoodDeckSF', 'HeatingQC', 'Foundation', 'SaleCondition',
    'BsmtExposure', 'MSSubClass', 'Condition1', '1stFlrSF', 'RoofStyle',
    'BldgType', 'LotShape', 'TotalBsmtSF', 'YearRemodAdd', 'ExterQual',
    'RoofMatl', 'MasVnrArea', 'OverallQual', 'Neighborhood', 'GarageArea',
    'BsmtFinSF1', 'GarageYrBlt', 'SaleType', 'HouseStyle', 'LotArea',
    '2ndFlrSF', 'MSZoning', 'CentralAir', 'YearBuilt', 'BsmtFinType1',
    'LotFrontage', 'YrSold', 'ExterCond', 'BsmtUnfSF', 'KitchenQual',
    'OpenPorchSF', 'PavedDrive', 'Exterior1st', 'Exterior2nd', 'LotConfig',
    'MoSold', 'LandContour', 'MasVnrType', 'GrLivArea', 'OverallCond',
    'BsmtCond',
]]
dftm = df_get_dummies(dftm)
# dftm.head()
dftm_fea = df_get_dummies(dftm_fea)

# Features = everything except the target. 'Ycut' (the binned target that
# may exist in regression runs) is removed as well when present. Both drops
# are chained on the same result: starting the second drop from the full
# frame again would put 'Y' back into the features and leak the target.
x_train = dftm.drop(columns=['Y']).drop(columns=['Ycut'], errors='ignore')
y_train = dftm['Y']

x_train_fea = dftm_fea.drop(columns=['Y']).drop(columns=['Ycut'], errors='ignore')
y_train_fea = dftm_fea['Y']
# Classification
# tm=xgb.XGBClassifier().fit(x_train, y_train)
# Regression: fit a default XGBRegressor on all features.
tm=xgb.XGBRegressor().fit(x_train, y_train)
# Plot the model's feature importances and turn the plot into a list.
# NOTE(review): the list appears to be ordered from most to least important
# (it is sliced with [:top] below) - confirm against the helper.
tm_plt=tm_plt_importance(tm)
fea_imp_list_xgb=tm_plt_importance_list(tm_plt)
# Keep the first `top` entries of each feature ranking.
top = 20
s1 = fea_imp_list_xgb[:top]
s2 = fea_imp_list_kf_str_fenlei[0][:top]
# The ANOVA table that exists depends on the task type; only the matching
# name is evaluated.
s3 = list(
    (
        fea_imp_list_num_ttest_anova_fenlei
        if ytype == 'fenlei'
        else fea_imp_list_str_ttest_anova_huigui
    ).loc['feature', :]
)[:top]

print('xgb: ', len(s1))
print(s1)
print('')

print('kf, str: ' if ytype == 'fenlei' else 'kf, str. !!!回归的Ycut仅供参考: ', len(s2))
print(s2)
print('')

print('anova, num. 用于分类: ' if ytype == 'fenlei' else 'anova, str. 用于回归:', len(s3))
print(s3)
print('')

dfc_str, dfc_num, dfc_num_int, dfc_num_float = dfc_type(df, if_print=True)
# print('important str')
# print(list(set(tpil_top10).intersection(dfc_str)))
# print('unimportant str')
# print(list(set(dfc_str).difference(tpil)))
# print('important num')
# print(list(set(tpil_top10).intersection(dfc_num)))
# print('unimportant num')
# print(list(set(dfc_num).difference(tpil)))

# xgb & chi-square
s4 = set(s1).intersection(s2)
print(s4)

# xgb & anova
s5 = set(s1).intersection(s3)
print(s5)

# xgb & dfc_num
s6 = set(s1).intersection(dfc_num)
print(len(s6))
print(s6)

# Ad-hoc comparisons between the rankings (values shown only interactively).
s5.difference(s4)
set(s2).difference(s3)
set(s3).difference(s2)
set(s2).intersection(s3)

# Union of all three rankings.
s = list(set(s1 + s2 + s3))
print(len(s))
print(s)
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows, using the same test_size and random_state for the
# full and the filtered feature sets.
x_train, x_test, y_train, y_test = train_test_split(
    x_train, y_train, test_size=0.25, random_state=0
)
x_train_fea, x_test_fea, y_train_fea, y_test_fea = train_test_split(
    x_train_fea, y_train_fea, test_size=0.25, random_state=0
)

# Report the size of every split.
for _x_part, _y_part in (
    (x_train, y_train),
    (x_test, y_test),
    (x_train_fea, y_train_fea),
    (x_test_fea, y_test_fea),
):
    print(_x_part.shape)
    print(len(_y_part))

from sklearn.preprocessing import StandardScaler

# Standardise the features: fit on the training part, apply to the test part.
std_x = StandardScaler()
x_train = std_x.fit_transform(x_train)
x_test = std_x.transform(x_test)
# The same scaler object is re-fitted here, so from now on `std_x` holds the
# statistics of the filtered feature set.
x_train_fea = std_x.fit_transform(x_train_fea)
x_test_fea = std_x.transform(x_test_fea)
# Linear regression
from sklearn import linear_model
from sklearn.linear_model import LinearRegression,Ridge,Lasso
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV

# Grid search over a coarse range of the ridge regularisation strength.
params = {'alpha': [0.001, 0.01, 0.1, 1, 10, 100]}
tm = GridSearchCV(Ridge(), params, cv=10)
tm.fit(x_train, y_train)
tm.best_params_
# Parallel lists collecting one entry per evaluated model.
tm_names=[]
tm_scores_train=[]
tm_scores_test=[]


def tm_result_list(tm_name, tm_model, x_tr=None, y_tr=None, x_te=None, y_te=None):
    """Score a fitted model and record the result in the module-level lists.

    Appends ``tm_name`` to ``tm_names`` and the model's ``score`` on the
    training and test data to ``tm_scores_train`` / ``tm_scores_test``.

    Args:
        tm_name: Label stored for the model.
        tm_model: Fitted estimator exposing ``score(X, y)``.
        x_tr, y_tr: Training data to score on; each defaults to the global
            ``x_train`` / ``y_train`` when left as ``None``.
        x_te, y_te: Test data to score on; each defaults to the global
            ``x_test`` / ``y_test`` when left as ``None``.

    Raises:
        Whatever ``tm_model.score`` raises; in that case none of the three
        lists is modified.
    """
    # Fall back to the module-level split only for arguments not supplied.
    if x_tr is None:
        x_tr = x_train
    if y_tr is None:
        y_tr = y_train
    if x_te is None:
        x_te = x_test
    if y_te is None:
        y_te = y_test

    # Compute both scores before touching the lists, so a failing score call
    # cannot leave the lists with different lengths.
    train_score = tm_model.score(x_tr, y_tr)
    test_score = tm_model.score(x_te, y_te)

    tm_names.append(tm_name)
    tm_scores_train.append(train_score)
    tm_scores_test.append(test_score)
# Record the ridge model trained on all features and show the score table.
tm_result_list('线性回归_无筛选特征', tm)
pd.DataFrame({'model': tm_names, 'train': tm_scores_train, 'test': tm_scores_test})

# Same coarse ridge grid search, this time on the filtered feature set.
params = {'alpha': [0.001, 0.01, 0.1, 1, 10, 100]}
tm_fea = GridSearchCV(Ridge(), params, cv=10)
tm_fea.fit(x_train_fea, y_train_fea)
tm_fea.best_params_

# Recorded by hand because the scores come from the filtered split.
tm_names.append('线性回归_有筛选特征')
tm_scores_train.append(tm_fea.score(x_train_fea, y_train_fea))
tm_scores_test.append(tm_fea.score(x_test_fea, y_test_fea))
pd.DataFrame({'model': tm_names, 'train': tm_scores_train, 'test': tm_scores_test})
# Grid search for the tree model on all features.
# NOTE(review): the estimator is XGBRFRegressor while the result label says
# 'xgboost', and the learning_rate grid reaches 3 - confirm both are intended.
params = {
    'n_estimators': range(100, 200, 50),
    'max_depth': range(2, 15, 4),
    'learning_rate': np.linspace(0.01, 3, 5),
}
tm = GridSearchCV(xgb.XGBRFRegressor(), param_grid=params, cv=5)
tm.fit(x_train, y_train)
tm.best_params_
tm_result_list('xgboost_无筛选特征', tm)
pd.DataFrame({'model': tm_names, 'train': tm_scores_train, 'test': tm_scores_test})

# Same search on the filtered feature set.
params = {
    'n_estimators': range(100, 200, 50),
    'max_depth': range(2, 15, 4),
    'learning_rate': np.linspace(0.01, 3, 5),
}
tm_fea = GridSearchCV(xgb.XGBRFRegressor(), param_grid=params, cv=5)
tm_fea.fit(x_train_fea, y_train_fea)
tm_fea.best_params_

# Recorded by hand because the scores come from the filtered split.
tm_names.append('xgboost_有筛选特征')
tm_scores_train.append(tm_fea.score(x_train_fea, y_train_fea))
tm_scores_test.append(tm_fea.score(x_test_fea, y_test_fea))
pd.DataFrame({'model': tm_names, 'train': tm_scores_train, 'test': tm_scores_test})